1 What Does Hive DML Do?

2 Load Comma-Delimited Data File

CREATE TABLE IF NOT EXISTS employees12 (
  name STRING COMMENT 'Employee name',
  salary FLOAT COMMENT 'Employee salary',
  FexTax FLOAT,
  StateTax FLOAT,
  Insurance FLOAT,
  address STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
Mary Smith,80000.0,.2,.05,.1,100 Ontario St. Chicago IL 60601
Todd Jones,70000.0,.15,.03,.1,108 Oak Park Chicago IL 60700
Bill King,60000.0,.15,.03,.1,10 Ontario St. Chicago IL 60700
Sam Kim,110000.0,.2,.05,.1,101 Uptown Rd. Ithaca NY 14850
Maria Booth,60000.0,.15,.03,.1,46 South Gate Dr. State College PA 16801
Joe Howe,75000.0,.2,.5,.1,24 Garden Ave. Ithaca NY 14853

3 Load Comma-Delimited Data Files

LOAD DATA LOCAL INPATH '/empdata-comma.txt'
OVERWRITE INTO TABLE employees12;

The LOCAL keyword:

hive> select * from employees12;
OK
John Smith  100000.0    0.2 0.05    0.1 1 Michigan Ave. Chicago IL 60600
Mary Smith  80000.0 0.2 0.05    0.1 100 Ontario St. Chicago IL 60601
Todd Jones  70000.0 0.15    0.03    0.1 108 Oak Park Chicago IL 60700
Bill King   60000.0 0.15    0.03    0.1 10 Ontario St. Chicago IL 60700
Sam Kim 110000.0    0.2 0.05    0.1 101 Uptown Rd. Ithaca NY 14850
Maria Booth 60000.0 0.15    0.03    0.1 46 South Gate Dr. State College PA 16801
Joe Howe    75000.0 0.2 0.5 0.1 24 Garden Ave. Ithaca NY 14853

4 Load Tab-Delimited Data File

CREATE TABLE IF NOT EXISTS employees11 (
  name STRING COMMENT 'Employee name',
  salary FLOAT COMMENT 'Employee salary',
  FedTax FLOAT,
  StateTax FLOAT,
  Insurance FLOAT,
  address STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n';
Todd Jones  70000.0 .15 .03 .1  108 Oak Park Chicago IL 60700
Bill King   60000.0 .15 .03 .1  10 Ontario St. Chicago IL 60700
Sam Kim 110000.0    .2  .05 .1  101 Uptown Rd. Ithaca NY 14850
Maria Booth 60000.0 .15 .03 .1  46 South Gate Dr. State College PA 16801
Joe Howe    75000.0 .2  .5  .1  24 Garden Ave. Ithaca NY 14853

5 Load Tab-Delimited Data Files

LOAD DATA LOCAL INPATH '/empdata-tab.txt'
OVERWRITE INTO TABLE employees11;
Time taken: 2.4 seconds
hive> select * from employees11;
OK
John Smith  100000.0    0.2 0.05    0.1 1 Michigan Ave. Chicago IL 60600
Mary Smith  80000.0 0.2 0.05    0.1 100 Ontario St. Chicago IL 60601
Todd Jones  70000.0 0.15    0.03    0.1 108 Oak Park Chicago IL 60700
Bill King   60000.0 0.15    0.03    0.1 10 Ontario St. Chicago IL 60700
Sam Kim 110000.0    0.2 0.05    0.1 101 Uptown Rd. Ithaca NY 14850
Maria Booth 60000.0 0.15    0.03    0.1 46 South Gate Dr. State College PA 16801
Joe Howe    75000.0 0.2 0.5 0.1 24 Garden Ave. Ithaca NY 14853

6 Load Data of Complex Data Types

CREATE TABLE IF NOT EXISTS employees20 (
  name STRING COMMENT 'Employee name',
  salary FLOAT COMMENT 'Employee salary',
  subordinates ARRAY<STRING> COMMENT 'Names of subordinates',
  deductions MAP<STRING, FLOAT>
    COMMENT 'Keys are deductions names, values are percentages',
  address STRUCT<street:STRING, city:STRING, state:STRING, ZIP:INT>
    COMMENT 'Home address')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '^'
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY '@'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
John Smith^100000.0^Mary Smith#David Wells^Federal Taxes@0.2#State Taxes@0.05#Insurance@0.1^1 Michigan Ave.#Chicago#IL#60600
Mary Smith^80000.0^Bill King^Federal Taxes@0.2#State Taxes@0.05#Insurance@0.1^100 Ontario St.#Chicago#IL#60601
Jason Yang^150000.0^Diana Foster#Kelli Doe#Kevin Hoover^Federal Taxes@0.2#State Taxes@0.05#Insurance@0.1^101 Uptown Rd.#Ithaca#NY#14850
Ted Wang^120000.0^Diana Johnson#Kevin Li#Dan Jones^Federal Taxes@0.2#State Taxes@0.05#Insurance@0.1^21 Farm Rd.#Ithaca#NY#14850
Mike McPheron^150000.0^Marvin Xu#Elise Lee^Federal Taxes@0.02#State Taxes@0.05#Insurance@0.1^12 Bush Ave.#Ithaca#NY#14850

7 Load Data of Complex Data Types

LOAD DATA LOCAL INPATH '/empdata.txt'
OVERWRITE INTO TABLE employees20;
hive> select * from employees20;
OK
John Smith  100000.0    ["Mary Smith","David Wells"]    {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1}    {"street":"1 Michigan Ave.","city":"Chicago","state":"IL","zip":60600}
Mary Smith  80000.0 ["Bill King"]   {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1}    {"street":"100 Ontario St.","city":"Chicago","state":"IL","zip":60601}
Jason Yang  150000.0    ["Diana Foster","Kelli Doe","Kevin Hoover"] {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1}    {"street":"101 Uptown Rd.","city":"Ithaca","state":"NY","zip":14850}
Ted Wang    120000.0    ["Diana Johnson","Kevin Li","Dan Jones"]    {"Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1}    {"street":"21 Farm Rd.","city":"Ithaca","state":"NY","zip":14850}
Mike McPheron   150000.0    ["Marvin Xu","Elise Lee"]   {"Federal Taxes":0.02,"State Taxes":0.05,"Insurance":0.1}   {"street":"12 Bush Ave.","city":"Ithaca","state":"NY","zip":14850}
Time taken: 0.58 seconds, Fetched: 5 row(s)

8 Load Data of Complex Data Types into a Partitioned Table

CREATE TABLE IF NOT EXISTS employees23 (
  name STRING COMMENT 'Employee name',
  salary FLOAT COMMENT 'Employee salary',
  subordinates ARRAY<STRING> COMMENT 'Names of subordinates',
  deductions MAP<STRING, FLOAT>
    COMMENT 'Keys are deductions names, values are percentages',
  address STRUCT<street:STRING, city:STRING, state:STRING, ZIP:INT>
    COMMENT 'Home address')
PARTITIONED BY (country STRING, state STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '^'
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY '@'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

9 Load Data of Complex Data Types into a Partitioned Table (cont’d)

LOAD DATA LOCAL INPATH '/empdata-us-il.txt'
OVERWRITE INTO TABLE employees23
PARTITION (country = 'US', state = 'IL');


10 Query Partitioned Table of Complex Data Types

Select * from employees23;

11 Load Data of Complex Data Types into a Partitioned Table (cont’d)

LOAD DATA LOCAL INPATH '/empdata-us-ny.txt'
OVERWRITE INTO TABLE employees23
PARTITION (country = 'US', state = 'NY');


12 Load Data of Complex Data Types into a Partitioned Table (cont’d)

LOAD DATA LOCAL INPATH '/empdata-us-ca.txt'
OVERWRITE INTO TABLE employees23
PARTITION (country = 'US', state = 'CA');


13 Query Partitioned Table of Complex Data Types

See the whole table with three partitions:

Select * from employees23;

13.1 Using a Condition from a Partition

Select * from employees23 where state="NY";

13.2 Using a Condition from a Column (struct) field

Select * from employees23 where address.state="NY";

13.3 Using a Condition from a Column

Select name, salary, address.city, address.state
from employees23
where salary > 120000;

13.4 Using Conditions from Both a Column and a Partition

Select name, salary, address.city, address.state
from employees23
where salary > 120000 and (state = 'CA' or state = 'IL');

14 The Sub-Directories of a Partitioned Table

Those of the employees23 table:

.../employees23/country=US/state=CA
.../employees23/country=US/state=IL
.../employees23/country=US/state=NY

15 The Sub-Directories and the Files of a Partitioned Table

The location of the loaded file and its contents:

16 “Insert” Data into Tables from Queries

CREATE TABLE inserttbl like employees20;

INSERT OVERWRITE TABLE inserttbl
SELECT * FROM employees20
WHERE salary > 100000.0;

SELECT * FROM inserttbl;

17 Insert Data into Tables from Queries

INSERT OVERWRITE TABLE employees
PARTITION (country = 'US', state = 'OR')
SELECT * FROM staged_employees se
WHERE se.cnty = 'US' AND se.st = 'OR';
INSERT INTO TABLE employees
PARTITION (country = 'US', state = 'OR')
SELECT * FROM staged_employees se
WHERE se.cnty = 'US' AND se.st = 'OR';

18 An INSERT Syntax: Scan the Data Once and Split it Multiple Ways

FROM staged_employees se
INSERT OVERWRITE TABLE employees
  PARTITION (country = 'US', state = 'OR')
  SELECT * WHERE se.cnty = 'US' AND se.st = 'OR'
INSERT OVERWRITE TABLE employees
  PARTITION (country = 'US', state = 'CA')
  SELECT * WHERE se.cnty = 'US' AND se.st = 'CA'
INSERT OVERWRITE TABLE employees
  PARTITION (country = 'US', state = 'IL')
  SELECT * WHERE se.cnty = 'US' AND se.st = 'IL';

19 Dynamic Partition Inserts

Hive’s dynamic partition can infer the partitions to create based on query parameters:

INSERT OVERWRITE TABLE employees
PARTITION (country, state)
SELECT ..., se.cnty, se.st
FROM staged_employees se;

In this example, Hive determines the values of the partition keys, country and state, from the last two columns in the SELECT clause

Different names in staged_employees emphasize that the relationship between the source column values and the output partition values is by position only and not by matching on names

20 Mix Dynamic and Static Partitions

Static value for the country (US) and a dynamic value for the state. The static partition keys must come before the dynamic partition keys

INSERT OVERWRITE TABLE employees
PARTITION (country = 'US', state)
SELECT ..., se.cnty, se.st
FROM staged_employees se
WHERE se.cnty = 'US';

21 Dynamic Partitions Properties

Name Default Description
hive.exec.dynamic.partition false Set to true to enable dynamic partitioning
hive.exec.dynamic.partition.mode strict Set to nonstrict to enable all partitions to be determined dynamically
hive.exec.max.dynamic.partitions.pernode 100 The maximum number of dynamic partitions that can be created by each mapper or reducer. Raises a fatal error if one mapper or reducer attempts to create more than the threshold
hive.exec.max.dynamic.partitions +1000 The total number of dynamic partitions that can be created by one statement with dynamic partitioning. Raises a fatal error if the limit is exceeded
hive.exec.max.created.files 100000 The maximum total number of files that can be created globally. A Hadoop counter is used to track the number of files created. Raises a fatal error if the limit is exceeded

22 Set the Desired Properties Just Before Use

hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> set hive.exec.max.dynamic.partitions.pernode=1000;
hive> INSERT OVERWRITE TABLE employees
    > PARTITION (country, state)
    > SELECT ..., se.cty, se.st
    > FROM staged_employees se;

23 Create Tables and Load Them in One Query

CREATE TABLE ca_employees
AS SELECT name, salary, address
FROM employees
WHERE state = 'CA';

24 Export Data

INSERT OVERWRITE LOCAL DIRECTORY '/tmp/ca_employees'
SELECT name, salary, address
FROM employees
WHERE state = 'CA';

Independent of how the data is actually stored in the source table, it is written to files with all fields serialized as strings

hive> ! ls /tmp/ca_employees;
000000_0
hive> ! cat /tmp/ca_employees/000000_0
John Doe100000.0201 San Antonio CircleMountain ViewCA94040
Mary Smith80000.01 Infinity LoopCupertinoCA95014
...